################################################################################
## Paper:     Working the Crowd
## Authors:   P. Mongrain, N. Fréchet, B. Thompson Collart, and Y. Dufresne
## Date:      February 2025
################################################################################

################################################################################
## IMPORTANT NOTE
################################################################################

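# Please run "merge.do" (Stata) *before* running this script; the script reads
# "merge.dta" from the working directory (presumably the file "merge.do" creates).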

################################################################################
## SET WORKING DIRECTORY
################################################################################

# Show the current working directory so it can be checked before it is changed.
getwd()

# Point R at the folder that holds the replication files: "merge.dta" is read
# from, and "ca2019_preds_g.tiff" is written to, the working directory.
setwd("C:/...") #add appropriate file path

################################################################################
## LOAD PACKAGES 
################################################################################

#remotes::install_github("NightingaleHealth/ggforestplot")

library(cowplot)
library(devtools)
library(dfoptim)
library(dotwhisker)
library(dplyr) 
library(effects)
library(emmeans)
library(expss)
library(ggalt)
library(ggeasy)
library(ggeffects)
library(ggforestplot)
library(ggplot2)
library(ggpubr)
library(ggridges)
library(grid)
library(gridExtra)
library(haven)
library(interplot)
library(lattice)
library(lme4)
library(lubridate)
library(magrittr)
library(marginaleffects)
library(margins)
library(merTools)
library(prettyR)
library(psych)
library(rms)
library(scales)
library(sjmisc)
library(sjPlot)
library(srvyr)
library(tibble)
library(tidyverse)
library(visreg)

if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, lme4, marginaleffects, optimx, broom.mixed)

################################################################################
## LOAD DATA
################################################################################

## IMPORT DATA FROM STATA

# Read the merged Stata file; rows from all elections are stacked and
# identified by the `election` column.
# NOTE: this object shares its name with base::merge(). Calls of the form
# `merge(...)` further down still reach the function, because R skips
# non-function objects when looking up a function name.
merge <- read_dta("merge.dta")

## CREATE SEPARATE DATAFRAME FOR EACH ELECTION

# Canadian federal elections
ca2011.data <- filter(merge, election == "ca2011")
ca2015.data <- filter(merge, election == "ca2015")
ca2019.data <- filter(merge, election == "ca2019")

# Provincial elections
on2011.data <- filter(merge, election == "on2011")
on2014.data <- filter(merge, election == "on2014")
qc2022.data <- filter(merge, election == "qc2022")

## DEFINE VARIABLES AS FACTOR OR NUMERIC

# Coerce the 2019 variables in one pass: categorical variables become factors,
# continuous ones become plain numerics. `vote_district` is then relabelled
# from its 0/1 codes to "No"/"Yes" (mutate() evaluates its arguments in order,
# so the relabelling sees the factor created by the first across()).
ca2019.data <- ca2019.data %>%
  mutate(
    across(
      c(
        correct_district, vote_district, pidstatus_district, univ, male,
        age55, highinc, reelected
      ),
      as.factor
    ),
    across(c(margin, interest_n), as.numeric),
    vote_district = factor(vote_district,
                           levels = c(0, 1),
                           labels = c("No", "Yes"))
  )

## CREATE MODE FUNCTION

#' Most frequent value of a vector
#'
#' @param v A vector (atomic or factor).
#' @return The value of `v` that occurs most often, with the same type as `v`.
#'   When several values are tied, the one that appears first in `v` is
#'   returned. `NA` is counted like any other value.
getmode <- function(v) {
  distinct_values <- unique(v)
  # Position of every element of `v` within the distinct values ...
  positions <- match(v, distinct_values)
  # ... so that tabulating the positions gives one count per distinct value.
  frequencies <- tabulate(positions)
  distinct_values[which.max(frequencies)]
}

################################################################################
## RANDOM INTERCEPT
################################################################################

## Estimate model

# Standardise the two continuous covariates (subtract the mean, divide by the
# standard deviation). `na.rm = TRUE` keeps a single missing value from turning
# the whole column into NA; with complete data the result is unchanged.
ca2019.data <- ca2019.data %>%
  mutate(
    margin = (margin - mean(margin, na.rm = TRUE)) /
      sd(margin, na.rm = TRUE), #scaling margin
    time = (time - mean(time, na.rm = TRUE)) /
      sd(time, na.rm = TRUE) #scaling response date
  )

# NOTE(review): lmerTest is loaded here, but the models below are fitted with
# glmer(), whose summary already reports Wald z-tests -- confirm it is needed.
pacman::p_load(lmerTest) #show statistical significance

# Random-intercept logit model: probability of a correct district forecast as a
# function of vote_district x univ plus individual- and district-level
# controls, with an intercept that varies by electoral district.
# The survey weights are passed through the fully spelled-out `weights`
# argument (the original `weight=` only worked via partial argument matching).
ca2019.ri.fit <- glmer(
  correct_district ~ 1 + vote_district * univ + male +
    age55 + highinc + reelected + margin + time +
    (1 | district_code),
  data = ca2019.data,
  family = binomial(link = "logit"),
  weights = survey_weight
)

# Coefficient table without the fixed-effect correlation matrix.
summary(ca2019.ri.fit, correlation = FALSE)

# Estimated marginal means of `univ` within each level of `vote_district`.
ca2019.ri.int1 <- emmeans(ca2019.ri.fit, specs = ~ univ | vote_district) #first difference

# Re-grid the means, contrast `univ` within each `vote_district` level, then
# contrast those contrasts across levels (`by = NULL` drops the grouping).
ca2019.ri.int2 <- ca2019.ri.int1 %>%
  emmeans::regrid() %>%
  pairs() %>%
  pairs(by = NULL) #second difference

# Same marginal means requested together with their pairwise contrasts,
# followed by confidence intervals for the contrasts.
ca2019.ri.int.em1 <- emmeans(ca2019.ri.fit, specs = pairwise ~ univ | vote_district)
ca2019.ri.int.em2 <- confint(pairs(ca2019.ri.int.em1))

## Extracting random effects

# Print the district-level random intercepts.
ranef(ca2019.ri.fit,
      condVar = FALSE,
      drop = TRUE) #to have them as a vector

## Plotting random effects

# One point per district, sorted by the estimated random effect, with a
# +/- 1.96 * SE bar around each estimate and a reference line at zero.
ggplot(
  broom.mixed::tidy(ca2019.ri.fit, effects = "ran_vals"),
  aes(x = reorder(level, -estimate), y = estimate)
) +
  geom_hline(yintercept = 0, color = "grey70") +
  geom_point(color = "cyan4") +
  geom_errorbar(
    aes(ymin = estimate - 1.96 * std.error,
        ymax = estimate + 1.96 * std.error),
    width = 0,
    color = "cyan4"
  ) +
  labs(x = "District", y = "RE estimate") +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  theme_classic() +
  coord_flip()

################################################################################
## RANDOM SLOPE
################################################################################

# Random-slope logit model: same fixed effects as the random-intercept model,
# but the intercept and the vote_district x univ terms are allowed to vary by
# electoral district. Fitted with the "bobyqa" optimizer.
# The survey weights are passed through the fully spelled-out `weights`
# argument (the original `weight=` only worked via partial argument matching).
ca2019.rs.fit.bob <- glmer(
  correct_district ~ 1 + vote_district * univ + male +
    age55 + highinc + margin + time + reelected +
    (1 + vote_district * univ | district_code),
  data = ca2019.data,
  family = binomial(link = "logit"),
  weights = survey_weight,
  control = glmerControl(optimizer = "bobyqa")
)

# Coefficient table without the fixed-effect correlation matrix.
summary(ca2019.rs.fit.bob, correlation = FALSE)

# Estimated marginal means of `univ` within each level of `vote_district`.
ca2019.rs.int1 <- emmeans(ca2019.rs.fit.bob, specs = ~ univ | vote_district) #first difference

# Re-grid the means, contrast `univ` within each `vote_district` level, then
# contrast those contrasts across levels (`by = NULL` drops the grouping).
ca2019.rs.int2 <- ca2019.rs.int1 %>%
  emmeans::regrid() %>%
  pairs() %>%
  pairs(by = NULL) #second difference

# Same marginal means requested together with their pairwise contrasts,
# followed by confidence intervals for the contrasts.
ca2019.rs.int.em1 <- emmeans(ca2019.rs.fit.bob, specs = pairwise ~ univ | vote_district)
ca2019.rs.int.em2 <- confint(pairs(ca2019.rs.int.em1))

################################################################################
## PREDICTED PROBABILITIES
################################################################################

# Keep only rows that are complete on every variable the fitted models use.
# The original filter left out `time`, `survey_weight` and `district_code`,
# although rows missing any of them are also dropped when the model is fitted.
# Without them the prediction grid (district list and modal covariate values)
# could be built from rows or districts outside the estimation sample, and
# predict() stops on district levels it has not seen.
ca2019.data <- ca2019.data %>%
  filter(
    !is.na(correct_district),
    !is.na(univ),
    !is.na(vote_district),
    !is.na(male),
    !is.na(age55),
    !is.na(highinc),
    !is.na(reelected),
    !is.na(margin),
    !is.na(time),
    !is.na(survey_weight),
    !is.na(district_code)
  )

# Build one prediction grid: a row per electoral district for a given
# combination of `univ` and `vote_district`, with the categorical controls held
# at their modal value and the standardised covariates (margin, time) at 0,
# i.e. at their mean.
#
# univ           "0" or "1" (level of the `univ` factor)
# vote_district  "No" or "Yes" (level of the `vote_district` factor)
# data           data frame supplying the districts and the modal values
#
# `interest_n` is not a term in the fitted model; it is kept (set to the same
# 0/1 value as `univ`) so the grids have the same columns as before.
ca2019.rs.pred.grid <- function(univ, vote_district, data = ca2019.data) {
  data.frame(
    district_code = unique(data$district_code),
    univ = univ,
    interest_n = as.numeric(univ),
    vote_district = vote_district,
    male = getmode(data$male),
    age55 = getmode(data$age55),
    highinc = getmode(data$highinc),
    reelected = getmode(data$reelected),
    margin = 0,
    time = 0
  )
}

# 1: no university / "No"    2: university / "No"
# 3: no university / "Yes"   4: university / "Yes"
ca2019.rs.preds.1 <- ca2019.rs.pred.grid(univ = "0", vote_district = "No")
ca2019.rs.preds.2 <- ca2019.rs.pred.grid(univ = "1", vote_district = "No")
ca2019.rs.preds.3 <- ca2019.rs.pred.grid(univ = "0", vote_district = "Yes")
ca2019.rs.preds.4 <- ca2019.rs.pred.grid(univ = "1", vote_district = "Yes")

# District-specific predicted probabilities from the random-slope model.
ca2019.rs.preds.1$preds <- predict(ca2019.rs.fit.bob, newdata = ca2019.rs.preds.1, type = "response")
ca2019.rs.preds.2$preds <- predict(ca2019.rs.fit.bob, newdata = ca2019.rs.preds.2, type = "response")
ca2019.rs.preds.3$preds <- predict(ca2019.rs.fit.bob, newdata = ca2019.rs.preds.3, type = "response")
ca2019.rs.preds.4$preds <- predict(ca2019.rs.fit.bob, newdata = ca2019.rs.preds.4, type = "response")

# Each set of predictions gets a `pr.type` code and an `id` giving the plotting
# position of the district:
#   1 / 3   no-university predictions, ranked by their own predicted value
#   2 / 4   university predictions, ranked by their own predicted value
#   88 / 99 the university predictions again, but placed at the position the
#           same district has in the no-university ranking (drawn faded in the
#           figure), so the two groups can be compared district by district.
#
# FIX: the original code merged the reference ranking into the 88 / 99 sets
# and then immediately overwrote `id` with 1..n, which discarded the merged
# ranking and placed these points in `district_code` order instead. The merged
# `id` is now kept. NOTE(review): this changes where the faded points appear in
# the figure -- confirm against the intended figure.

## Losers ("No"): no university
ca2019.rs.preds.1 <- ca2019.rs.preds.1 %>%
  mutate(pr.type = 1) %>%
  arrange(preds)
ca2019.rs.preds.1$id <- seq_len(nrow(ca2019.rs.preds.1))
order.district.loser <- subset(ca2019.rs.preds.1, select = c(district_code, id))

## Losers ("No"): university
# The copy for the faded points is taken before `id` is added, so that `id`
# comes only from the reference ranking merged in below.
ca2019.rs.preds.2.2 <- ca2019.rs.preds.2 %>% mutate(pr.type = 88)
ca2019.rs.preds.2 <- ca2019.rs.preds.2 %>%
  mutate(pr.type = 2) %>%
  arrange(preds)
ca2019.rs.preds.2$id <- seq_len(nrow(ca2019.rs.preds.2))

# base::merge is spelled out because a data frame named `merge` also exists.
ca2019.rs.preds.2.2 <- base::merge(ca2019.rs.preds.2.2, order.district.loser,
                                   by = "district_code")

## Winners ("Yes"): no university
ca2019.rs.preds.3 <- ca2019.rs.preds.3 %>%
  mutate(pr.type = 3) %>%
  arrange(preds)
ca2019.rs.preds.3$id <- seq_len(nrow(ca2019.rs.preds.3))
order.district.ind <- subset(ca2019.rs.preds.3, select = c(district_code, id))

## Winners ("Yes"): university
ca2019.rs.preds.4.2 <- ca2019.rs.preds.4 %>% mutate(pr.type = 99)
ca2019.rs.preds.4 <- ca2019.rs.preds.4 %>%
  mutate(pr.type = 4) %>%
  arrange(preds)
ca2019.rs.preds.4$id <- seq_len(nrow(ca2019.rs.preds.4))

ca2019.rs.preds.4.2 <- base::merge(ca2019.rs.preds.4.2, order.district.ind,
                                   by = "district_code")

## Stack the three sets of each panel (rbind matches columns by name)
ca2019.rs.preds.loser <- rbind(ca2019.rs.preds.1, ca2019.rs.preds.2, ca2019.rs.preds.2.2)
ca2019.rs.preds.winner <- rbind(ca2019.rs.preds.3, ca2019.rs.preds.4, ca2019.rs.preds.4.2)

# `pr.type` drives the colour and alpha scales, so it must be discrete.
ca2019.rs.preds.loser$pr.type <- as.factor(ca2019.rs.preds.loser$pr.type)
ca2019.rs.preds.winner$pr.type <- as.factor(ca2019.rs.preds.winner$pr.type)

################################################################################
## FIGURE 3: CANADA 2019
################################################################################

# Draw one panel of the figure. The loser and winner panels were built by two
# identical copies of this code; they now share one helper.
#
# panel.data  data frame with columns `id` (plotting position), `preds`
#             (predicted probability), `pr.type` (three-level factor) and
#             `title` (facet strip label).
#
# Returns a ggplot object.
ca2019.rs.preds.panel <- function(panel.data) {
  ggplot(panel.data, aes(x = reorder(id, -id), y = preds)) +
    geom_point(aes(color = pr.type, alpha = pr.type), show.legend = FALSE) +
    labs(x = "District", y = "Predicted Probability of Correct Forecast") +
    scale_x_discrete(guide = guide_axis(n.dodge = 2),
                     expand = expansion(mult = c(.02, .02))) +
    scale_y_continuous("Predicted Probability", limits = c(0, 1),
                       breaks = seq(0, 1, .25)) +
    # Reference line at a probability of one half.
    geom_hline(yintercept = 0.5, color = "grey70", linetype = 2) +
    # Colours and alphas follow the order of the `pr.type` levels: the third
    # level reuses the second level's colour but is drawn at alpha .15.
    scale_colour_manual(name = "University",
                        values = c("#0072B2", "#88CCEE", "#88CCEE"),
                        labels = c("No", "Yes", "")) +
    scale_alpha_manual(values = c(1, 1, .15)) +
    theme_bw() +
    coord_flip() +
    facet_grid(. ~ title) +
    # Grid lines, the district axis text and both axis titles are removed; the
    # remaining axis text is white and the ticks fully transparent, so they
    # take up space without being visible.
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_blank(),
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.title.y = element_blank(),
          axis.title.x = element_blank(),
          axis.text.x = element_text(size = 14, colour = "white"),
          axis.ticks = element_line(color = "#00000000"),
          strip.text = element_text(size = 15),
          plot.title = element_text(size = 12, hjust = 0.5))
}

# Panel (a)
ca2019.rs.preds.loser$title <- "(a) Losers"
ca2019.rs.preds.loser.g <- ca2019.rs.preds.panel(ca2019.rs.preds.loser)

ca2019.rs.preds.loser.g

# Panel (b)
ca2019.rs.preds.winner$title <- "(b) Winners"
ca2019.rs.preds.winner.g <- ca2019.rs.preds.panel(ca2019.rs.preds.winner)

ca2019.rs.preds.winner.g

# Write the two-panel figure to a TIFF file in the working directory.
tiff("ca2019_preds_g.tiff", units = "in", width = 8, height = 4, res = 300)

# Loser and winner panels side by side.
ca2019.preds.g <- ggarrange(ca2019.rs.preds.loser.g,
                            ca2019.rs.preds.winner.g,
                            ncol = 2, nrow = 1)

# Overall title on top; the left label is drawn in white, so it reserves space
# without being visible.
ca2019.preds.g <- annotate_figure(ca2019.preds.g,
                                  left = textGrob("Electoral District", rot = 90,
                                                  vjust = .6, gp = gpar(col = "white", cex = 1)),
                                  top = textGrob("Canada 2019",
                                                 vjust = .5, gp = gpar(cex = 1, fontsize = 18)))

# Draw explicitly: a bare object name is only auto-printed at top level, so
# under source() the device would be closed without anything drawn on it.
print(ca2019.preds.g)

dev.off()

################################################################################
## RANDOM INTERCEPT VS RANDOM SLOPE
################################################################################

# Compare the random-intercept fit with the random-slope fit. Both models have
# the same fixed-effect terms and differ only in the district-level random
# effects.
anova(ca2019.ri.fit, ca2019.rs.fit.bob) #put the simpler model first

################################################################################
## MARGINAL EFFECTS
################################################################################

# Predictions from the random-slope model for each focal term, computed twice:
# with ggeffect() (objects ending in .prop) and with ggpredict() (objects
# ending in .ref).

## ggeffect()
ca2019.rs.univ.pred.prop <- ggeffect(ca2019.rs.fit.bob, terms = "univ")
ca2019.rs.vote.pred.prop <- ggeffect(ca2019.rs.fit.bob, terms = "vote_district")
ca2019.rs.margin.pred.prop <- ggeffect(ca2019.rs.fit.bob, terms = "margin")
ca2019.rs.reelected.pred.prop <- ggeffect(ca2019.rs.fit.bob, terms = "reelected")
ca2019.rs.int.pred.prop <- ggeffect(ca2019.rs.fit.bob, terms = c("univ", "vote_district"))

## ggpredict()
ca2019.rs.univ.pred.ref <- ggpredict(ca2019.rs.fit.bob, terms = "univ")
ca2019.rs.vote.pred.ref <- ggpredict(ca2019.rs.fit.bob, terms = "vote_district")
ca2019.rs.margin.pred.ref <- ggpredict(ca2019.rs.fit.bob, terms = "margin")
ca2019.rs.reelected.pred.ref <- ggpredict(ca2019.rs.fit.bob, terms = "reelected")
ca2019.rs.int.pred.ref <- ggpredict(ca2019.rs.fit.bob, terms = c("univ", "vote_district"))

## Interaction plots: vote_district on the axis, grouped by univ
plot(ggeffect(ca2019.rs.fit.bob, terms = c("vote_district", "univ")))
plot(ggpredict(ca2019.rs.fit.bob, terms = c("vote_district", "univ")))